rm(list = ls())
gc(verbose = FALSE)

args = commandArgs(trailingOnly = "TRUE")
args

arg1 <- args[1]
arg2 <- args[2]

# Define library folder
setwd(arg1)
.libPaths(arg2)
.libPaths(.libPaths()[1])

pkg <- c("nnls", "SuperLearner", "dplyr", "arm", "onehot", "grf", "xgboost", "caret", "data.table",
         "knitr", "kableExtra", "magrittr", "kimisc", "data.table", "readstata13", "DescTools", "scales", "ggplot2")
lapply(pkg, library, character.only = T, lib.loc = arg2)


# read the data
ihwap_complete = read.csv(paste0(arg1, "/Data/ihwap_full.csv"))


# drop some variables not to be used
ihwap_full = subset(ihwap_complete, select = -c(Household, ExistingConsumption, ProjectedConsumption, end_day, kfold, kfold2, rowid))
ihwap_completeX = subset(ihwap_complete, select = -c(Household, ExistingConsumption, ProjectedConsumption, end_day, kfold, kfold2, rowid, total_mmbtu, treated))

# will train model only with untreated homes
ihwap_full = ihwap_full[which(ihwap_full$treated==0), ]

### Data Frames for the models
# dependent variable
train_y = as.matrix(subset(ihwap_full, select = c("total_mmbtu")))
# independent variables
train_x = subset(ihwap_full, select = -c(total_mmbtu, treated))

######## Setup SuperLearner
# Function to create a list of model names
expandingList <- function(capacity = 10) {
  buffer <- vector('list', capacity)
  length <- 0
  
  methods <- list()
  
  methods$double.size <- function() {
    buffer <<- c(buffer, vector('list', capacity))
    capacity <<- capacity * 2
  }
  
  methods$add <- function(val) {
    if(length == capacity) {
      methods$double.size()
    }
    
    length <<- length + 1
    buffer[[length]] <<- val
  }
  
  methods$as.list <- function() {
    b <- buffer[0:length]
    return(b)
  }
  
  methods
}
SL.library<-expandingList()

# setup xgboost
xgboost.tune <- list(ntrees = c(1000),
                     max_depth = c(20, 30),
                     minobspernode = c(30),
                     shrinkage = c(0.05))

# Set detailed names = T so we can see the configuration for each function.
# Also shorten the name prefix.
xgboost <- create.Learner("SL.xgboost", tune = xgboost.tune, detailed_names = T, name_prefix = "xgboost")
for(i in 1:length(xgboost$names)){
  SL.library$add(c(xgboost$names[i]))
}

fit.gaselec.SL = SuperLearner(Y=train_y, X=train_x,
                                  SL.library=SL.library$as.list(), family=gaussian(),
                                  method="method.NNLS", verbose=TRUE, cvControl = list(V = 5))
saveRDS(fit.gaselec.SL, paste0(arg1, "/Results/Model_Outputs/predictpre_model_best.rds"))

print(fit.gaselec.SL)

## In-sample predictions
predictions = predict(fit.gaselec.SL, ihwap_completeX, onlySL = T)
ihwap_complete$pred_best = unlist(as.numeric(predictions$pred))

## Cross-validated predictions
ihwap_complete[which(ihwap_complete$treated==0), c("pred_cv")] = as.data.frame(fit.gaselec.SL$Z)

## Exporting results
write.csv(ihwap_complete, paste0(arg1, "/Results/Model_Outputs/CV_predictpre_best.csv"), row.names = FALSE)

